bold
italics
Load packages
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
source("functions.R")
I downloaded the file and loaded it into R
# Fetch the gapminder five-year data (once) and load it into a data frame.
gapminder_url <- paste0(
  "https://raw.githubusercontent.com/swcarpentry/r-novice-gapminder/",
  "gh-pages/_episodes_rmd/data/gapminder-FiveYearData.csv"
)
gapminder_path <- "data/gapminder-FiveYearData.csv"
# download.file() does not create missing directories, so make sure the
# destination folder exists before writing into it.
dir.create(dirname(gapminder_path), showWarnings = FALSE, recursive = TRUE)
# Skip the network round trip when a local copy is already present.
if (!file.exists(gapminder_path)) {
  download.file(gapminder_url, destfile = gapminder_path)
}
gapminder <- read.csv(gapminder_path)
head(gapminder)
## country year pop continent lifeExp gdpPercap
## 1 Afghanistan 1952 8425333 Asia 28.801 779.4453
## 2 Afghanistan 1957 9240934 Asia 30.332 820.8530
## 3 Afghanistan 1962 10267083 Asia 31.997 853.1007
## 4 Afghanistan 1967 11537966 Asia 34.020 836.1971
## 5 Afghanistan 1972 13079460 Asia 36.088 739.9811
## 6 Afghanistan 1977 14880372 Asia 38.438 786.1134
I wonder if rstats increases life expectancy over the years
# Scatter plot of lifeExp against year, one point per row of gapminder.
p <- ggplot(data = gapminder, mapping = aes(x = year, y = lifeExp)) +
  geom_point()
p
Let’s see the interactive version
# Hand the ggplot object to ggplotly() to get the interactive version.
p %>% ggplotly()
If you are repeating yourself in your code, you may be able to solve that problem by making your own function!
# Apply the custom se() helper (presumably defined in functions.R -- confirm)
# to a small numeric vector.
cars <- c(3:7, 10)
se(cars)
## [1] 1.013794
dplyr
You will likely want to get subsections of your data frame and/or calculate means of a variable for a certain subsection; dplyr is your friend!
Explored select
# Reload the data, then reduce it to year, country and GDP per capita.
gapminder <- read.csv(file = "data/gapminder-FiveYearData.csv")
# Variant 1: list the columns to keep.
year_country_gdp <- gapminder %>%
  select(year, country, gdpPercap)
# Variant 2: drop the unwanted columns instead (overwrites variant 1).
year_country_gdp <- gapminder %>%
  select(-pop, -continent, -lifeExp)
names(year_country_gdp)
## [1] "country" "year" "gdpPercap"
Explore filter
# Step by step: keep the European rows, then pick three columns.
euro <- filter(gapminder, continent == "Europe")
year_country_gdp_euro <- select(euro, year, country, gdpPercap)
# The same result as one pipeline, without the intermediate data frame.
year_country_gdp_euro <- gapminder %>%
  filter(continent == "Europe") %>%
  select(year, country, gdpPercap)
exploring the amazing group_by and summarize functions
# One row per country: mean of gdpPercap plus se() of the same column.
mean_gdp_percountry <- gapminder %>%
  group_by(country) %>%
  summarise(
    mean_gdp = mean(gdpPercap),
    se_gdp = se(gdpPercap)
  )
mean_gdp_percountry
## # A tibble: 142 x 3
## country mean_gdp se_gdp
## <fctr> <dbl> <dbl>
## 1 Afghanistan 802.6746 31.23550
## 2 Albania 3255.3666 344.20223
## 3 Algeria 4426.0260 378.26190
## 4 Angola 3607.1005 336.56641
## 5 Argentina 8955.5538 537.68144
## 6 Australia 19980.5956 2256.11315
## 7 Austria 20411.9163 2787.23968
## 8 Bahrain 18077.6639 1563.29518
## 9 Bangladesh 817.5588 67.86165
## 10 Belgium 19900.7581 2422.32683
## # ... with 132 more rows
Challenge: I want the mean, se, and sample size of life expectancy by continent (the code below groups by continent and country, so each row is one country)
# Mean, se() and row count of lifeExp for every continent/country pair.
mean_se_life_percontinent <- gapminder %>%
  group_by(continent, country) %>%
  summarise(
    mean_life = mean(lifeExp),
    se_life = se(lifeExp),
    samsize_life = n()
  )
mean_se_life_percontinent
## # A tibble: 142 x 5
## # Groups: continent [?]
## continent country mean_life se_life samsize_life
## <fctr> <fctr> <dbl> <dbl> <int>
## 1 Africa Algeria 59.03017 2.9849208 12
## 2 Africa Angola 37.88350 1.1562236 12
## 3 Africa Benin 48.77992 1.7691977 12
## 4 Africa Botswana 54.59750 1.7116922 12
## 5 Africa Burkina Faso 44.69400 1.9762099 12
## 6 Africa Burundi 44.81733 0.9165096 12
## 7 Africa Cameroon 48.12850 1.5784640 12
## 8 Africa Central African Republic 43.86692 1.3627459 12
## 9 Africa Chad 46.77358 1.4110376 12
## 10 Africa Comoros 52.38175 2.3476081 12
## # ... with 132 more rows
combining ggplot and dplyr
# Life expectancy over time for the European rows, one facet per country.
euro_countries <- gapminder %>%
  filter(continent == "Europe") %>%
  ggplot(aes(x = year, y = lifeExp, color = country)) +
  geom_line() +
  facet_wrap(~country)
euro_countries
# Pass the plot explicitly: by default ggsave() writes whichever plot was
# displayed last, which silently changes if another plot is drawn in between.
ggsave("euro.png", plot = euro_countries)
## Saving 7 x 5 in image
# write.csv() does not create missing directories, so make sure the output
# folder exists before saving the per-country summary.
dir.create("processed", showWarnings = FALSE, recursive = TRUE)
write.csv(mean_gdp_percountry, "processed/mean_gdp_percountry.csv")
tidyr
# command to download the 'wide' data
# Fetch the wide-format gapminder data (once) and load it.
wide_url <- paste0(
  "https://raw.githubusercontent.com/swcarpentry/r-novice-gapminder/",
  "gh-pages/data/gapminder_wide.csv"
)
wide_path <- "data/gapminder_wide.csv"
# download.file() does not create missing directories, so make sure the
# destination folder exists before writing into it.
dir.create(dirname(wide_path), showWarnings = FALSE, recursive = TRUE)
# Skip the network round trip when a local copy is already present.
if (!file.exists(wide_path)) {
  download.file(wide_url, destfile = wide_path)
}
gapminder_wide <- read.csv(wide_path)
# Stack every observation column into key/value pairs. The id columns are
# excluded by name rather than selecting positions 3:38, so the reshape does
# not depend on how many observation columns the file has or on their order.
gap_long <- gapminder_wide %>%
  gather(
    obstype_year,
    obs_values,
    -continent, -country
  )
head(gap_long)
## continent country obstype_year obs_values
## 1 Africa Algeria gdpPercap_1952 2449.0082
## 2 Africa Angola gdpPercap_1952 3520.6103
## 3 Africa Benin gdpPercap_1952 1062.7522
## 4 Africa Botswana gdpPercap_1952 851.2411
## 5 Africa Burkina Faso gdpPercap_1952 543.2552
## 6 Africa Burundi gdpPercap_1952 339.2965
Separate the obstype_year column into obs_type and year
# Split the "type_year" keys at the underscore, then give each observation
# type its own column again.
gap_normal <- gap_long %>%
  separate(col = obstype_year, into = c("obs_type", "year"), sep = "_") %>%
  spread(key = obs_type, value = obs_values)
head(gap_normal)
## continent country year gdpPercap lifeExp pop
## 1 Africa Algeria 1952 2449.008 43.077 9279525
## 2 Africa Algeria 1957 3013.976 45.685 10270856
## 3 Africa Algeria 1962 2550.817 48.303 11000948
## 4 Africa Algeria 1967 3246.992 51.407 12760499
## 5 Africa Algeria 1972 4182.664 54.518 14760787
## 6 Africa Algeria 1977 4910.417 58.014 17152804
# Compare the original table with the rebuilt one; differences are reported
# as a character vector.
all.equal(target = gapminder, current = gap_normal)
## [1] "Names: 5 string mismatches"
## [2] "Component 1: Attributes: < Component \"levels\": Lengths (142, 5) differ (string compare on first 5) >"
## [3] "Component 1: Attributes: < Component \"levels\": 5 string mismatches >"
## [4] "Component 1: 1704 string mismatches"
## [5] "Component 2: Attributes: < target is NULL, current is list >"
## [6] "Component 2: target is numeric, current is factor"
## [7] "Component 3: Modes: numeric, character"
## [8] "Component 3: target is numeric, current is character"
## [9] "Component 4: 'current' is not a factor"
## [10] "Component \"lifeExp\": Mean relative difference: 0.203822"
## [11] "Component 6: Mean relative difference: 4101.546"
# Sort the rebuilt table by country, continent and year, then compare again.
gap_normal <- arrange(gap_normal, country, continent, year)
all.equal(target = gapminder, current = gap_normal)
## [1] "Names: 5 string mismatches"
## [2] "Component 1: Attributes: < Component \"levels\": Lengths (142, 5) differ (string compare on first 5) >"
## [3] "Component 1: Attributes: < Component \"levels\": 5 string mismatches >"
## [4] "Component 1: 1704 string mismatches"
## [5] "Component 2: Attributes: < target is NULL, current is list >"
## [6] "Component 2: target is numeric, current is factor"
## [7] "Component 3: Modes: numeric, character"
## [8] "Component 3: target is numeric, current is character"
## [9] "Component 4: 'current' is not a factor"
## [10] "Component 6: Mean relative difference: 4101.546"